{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "ade42be5-7813-4aa6-9f4f-aad4318d4175",
   "metadata": {},
   "source": [
    "# Document Summary Index\n",
    "\n",
    "This demo showcases the document summary index over Wikipedia articles on different cities.\n",
    "\n",
    "The document summary index will extract a summary from each document and store that summary, as well as all nodes corresponding to the document.\n",
    "\n",
    "Retrieval can be performed through the LLM or through embeddings. We first select the documents relevant to the query based on their summaries. All nodes corresponding to the selected documents are then retrieved."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d58ab2ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "import getpass\n",
    "import os\n",
    "\n",
    "import openai\n",
    "\n",
    "# Never hardcode the key in the notebook: reuse OPENAI_API_KEY when it is already\n",
    "# exported, and otherwise prompt for it (getpass does not echo the input, so the\n",
    "# key does not end up in the saved notebook).\n",
    "if not os.environ.get(\"OPENAI_API_KEY\"):\n",
    "    os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API key: \")\n",
    "openai.api_key = os.environ[\"OPENAI_API_KEY\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5e03a80b-6f5e-4dda-9a05-201d4fafede1",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import logging\n",
    "import sys\n",
    "\n",
    "# basicConfig already attaches a stdout handler to the root logger. Adding a second\n",
    "# StreamHandler on top of it would print every record twice (and once more per\n",
    "# re-run of this cell), so basicConfig alone is used.\n",
    "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n",
    "# # Uncomment if you want to temporarily disable logger\n",
    "# logger = logging.getLogger()\n",
    "# logger.disabled = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "4b6d4d55-2a2f-41d5-aa32-159d6bc406fe",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import nest_asyncio\n",
    "\n",
    "nest_asyncio.apply()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4fb7288e-22f8-4753-a6ea-197cf2f8aba5",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:numexpr.utils:Note: NumExpr detected 12 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n",
      "Note: NumExpr detected 12 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n",
      "INFO:numexpr.utils:NumExpr defaulting to 8 threads.\n",
      "NumExpr defaulting to 8 threads.\n"
     ]
    }
   ],
   "source": [
    "from llama_index import (\n",
    "    SimpleDirectoryReader,\n",
    "    LLMPredictor,\n",
    "    ServiceContext,\n",
    "    get_response_synthesizer,\n",
    ")\n",
    "from llama_index.indices.document_summary import DocumentSummaryIndex\n",
    "from llama_index.llms import OpenAI"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "8c391a70-7690-4bbd-a2dc-f95b845991a7",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Load Datasets\n",
    "\n",
    "Load Wikipedia pages on different cities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "23ae10cc-f552-434c-9133-e4adf6642198",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "wiki_titles = [\"Toronto\", \"Seattle\", \"Chicago\", \"Boston\", \"Houston\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "24e0e454-218e-4937-b1f9-f1c8e2abba43",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "import requests\n",
    "\n",
    "# Create the output directory once, up front; exist_ok keeps the cell safe to re-run.\n",
    "data_path = Path(\"data\")\n",
    "data_path.mkdir(exist_ok=True)\n",
    "\n",
    "# Download the plain-text extract of each Wikipedia page into data/<title>.txt\n",
    "for title in wiki_titles:\n",
    "    http_response = requests.get(\n",
    "        \"https://en.wikipedia.org/w/api.php\",\n",
    "        params={\n",
    "            \"action\": \"query\",\n",
    "            \"format\": \"json\",\n",
    "            \"titles\": title,\n",
    "            \"prop\": \"extracts\",\n",
    "            # 'exintro': True,\n",
    "            \"explaintext\": True,\n",
    "        },\n",
    "        # Without a timeout a stalled connection would hang the notebook forever.\n",
    "        timeout=30,\n",
    "    )\n",
    "    # Fail loudly on an HTTP error instead of trying to parse an error page.\n",
    "    http_response.raise_for_status()\n",
    "    response = http_response.json()\n",
    "    page = next(iter(response[\"query\"][\"pages\"].values()))\n",
    "    wiki_text = page[\"extract\"]\n",
    "\n",
    "    # Wikipedia text contains non-ASCII characters; write UTF-8 explicitly so the\n",
    "    # result does not depend on the platform's default encoding.\n",
    "    with open(data_path / f\"{title}.txt\", \"w\", encoding=\"utf-8\") as fp:\n",
    "        fp.write(wiki_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "6f765eee-0c80-476c-b1f2-b96b5dd176db",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Read every downloaded city file; the first document loaded from each file gets\n",
    "# the city name as its doc_id so it can be looked up by name later.\n",
    "city_docs = []\n",
    "for wiki_title in wiki_titles:\n",
    "    reader = SimpleDirectoryReader(input_files=[f\"data/{wiki_title}.txt\"])\n",
    "    docs = reader.load_data()\n",
    "    docs[0].doc_id = wiki_title\n",
    "    city_docs += docs"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "ef3de855-a3ee-4994-b3c0-0099fa7b5704",
   "metadata": {},
   "source": [
    "### Build Document Summary Index\n",
    "\n",
    "We show two ways of building the index:\n",
    "- default mode of building the document summary index\n",
    "- customizing the summary query\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "e4da51df-ff9f-4141-91fe-719e00824328",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# LLM (gpt-3.5-turbo, temperature 0) and the service context that carries it,\n",
    "# together with the chunk size, into index construction\n",
    "chatgpt = OpenAI(model=\"gpt-3.5-turbo\", temperature=0)\n",
    "service_context = ServiceContext.from_defaults(\n",
    "    chunk_size=1024,\n",
    "    llm=chatgpt,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "93c531c9-4aee-47ae-a4d2-81af3a6af908",
   "metadata": {
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "current doc id: Toronto\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=463 request_id=d6eb8fc8301bbb70e5ed906913ea4b42 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=463 request_id=d6eb8fc8301bbb70e5ed906913ea4b42 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=547 request_id=066ff477ea0931dabd06411b34ee1bc7 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=547 request_id=066ff477ea0931dabd06411b34ee1bc7 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=453 request_id=44708e4b96149d11b88569b7766e796d response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=453 request_id=44708e4b96149d11b88569b7766e796d response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=1783 request_id=7c2ef56d87c3bf8588037e1b1496ec98 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=1783 request_id=7c2ef56d87c3bf8588037e1b1496ec98 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2013 request_id=06abd4b0f41e1225ad692257598719bd response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2013 request_id=06abd4b0f41e1225ad692257598719bd response_code=200\n",
      "INFO:llama_index.indices.document_summary.base:> Generated summary for doc Toronto: The provided text is about the city of Toronto, covering various aspects such as its history, demographics, cultural diversity, economic sectors, landmarks, and historical events. It provides information on Toronto's population, architectural heritage, climate, parks, media and entertainment, real estate, and technology industry. The text can answer questions such as: What is the population of Toronto? What is the history of Toronto? What is the significance of Toronto in terms of business and finance? What is the cultural diversity of Toronto? What are some notable landmarks in Toronto? What is the economic profile of Toronto?\n",
      "> Generated summary for doc Toronto: The provided text is about the city of Toronto, covering various aspects such as its history, demographics, cultural diversity, economic sectors, landmarks, and historical events. It provides information on Toronto's population, architectural heritage, climate, parks, media and entertainment, real estate, and technology industry. The text can answer questions such as: What is the population of Toronto? What is the history of Toronto? What is the significance of Toronto in terms of business and finance? What is the cultural diversity of Toronto? What are some notable landmarks in Toronto? What is the economic profile of Toronto?\n",
      "current doc id: Seattle\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2151 request_id=b762c5eb4cd33631ae0ba2051307a7b2 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2151 request_id=b762c5eb4cd33631ae0ba2051307a7b2 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2306 request_id=cee6fb49ad1848cc5b21e92fd553177f response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2306 request_id=cee6fb49ad1848cc5b21e92fd553177f response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2176 request_id=aaa812d8f1b9ab731bd84c15f419b15a response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2176 request_id=aaa812d8f1b9ab731bd84c15f419b15a response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=4577 request_id=efa532630c746a5f251f91840e2400f7 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=4577 request_id=efa532630c746a5f251f91840e2400f7 response_code=200\n",
      "INFO:llama_index.indices.document_summary.base:> Generated summary for doc Seattle: The provided text is a compilation of information about the city of Seattle. It covers various aspects such as the city's history, geography, demographics, economy, culture, tourism, government, and infrastructure. It provides details about Seattle's founding, growth, major industries, notable figures, topography, bodies of water, climate, racial and ethnic makeup, population growth, LGBTQ+ community, performing arts scene, annual fairs and festivals, music scene, religion, sports, parks and recreation, education system, media outlets, infrastructure, international relations, political culture, education level, healthcare facilities, transportation options, utility services, and sister cities.\n",
      "\n",
      "Based on this text, some questions that can be answered include:\n",
      "- What is the population of Seattle?\n",
      "- What major industries have contributed to Seattle's growth?\n",
      "- Who were some notable figures associated with Seattle?\n",
      "- What is the cultural significance of Seattle in terms of music?\n",
      "- What is the geography of Seattle, including its topography and bodies of water?\n",
      "- What is the racial and ethnic makeup of Seattle?\n",
      "- How has Seattle's population grown over time?\n",
      "- What is the LGBTQ+ community like in Seattle?\n",
      "- What is the economy of Seattle driven by?\n",
      "- What is the performing arts scene like in Seattle?\n",
      "- What annual fairs and festivals take place in Seattle?\n",
      "- Who are some notable musicians from Seattle?\n",
      "- What is the religious demographic of Seattle?\n",
      "- What major sports teams are based in Seattle?\n",
      "- What outdoor activities are available in Seattle?\n",
      "- What is the political culture of Seattle?\n",
      "- What is the educational attainment level in Seattle?\n",
      "- What are the major newspapers and media outlets in Seattle?\n",
      "- What healthcare facilities are available in Seattle?\n",
      "- What are the transportation options in Seattle?\n",
      "- Which utility companies serve Seattle?\n",
      "- Which cities are sister cities of Seattle?\n",
      "> Generated summary for doc Seattle: The provided text is a compilation of information about the city of Seattle. It covers various aspects such as the city's history, geography, demographics, economy, culture, tourism, government, and infrastructure. It provides details about Seattle's founding, growth, major industries, notable figures, topography, bodies of water, climate, racial and ethnic makeup, population growth, LGBTQ+ community, performing arts scene, annual fairs and festivals, music scene, religion, sports, parks and recreation, education system, media outlets, infrastructure, international relations, political culture, education level, healthcare facilities, transportation options, utility services, and sister cities.\n",
      "\n",
      "Based on this text, some questions that can be answered include:\n",
      "- What is the population of Seattle?\n",
      "- What major industries have contributed to Seattle's growth?\n",
      "- Who were some notable figures associated with Seattle?\n",
      "- What is the cultural significance of Seattle in terms of music?\n",
      "- What is the geography of Seattle, including its topography and bodies of water?\n",
      "- What is the racial and ethnic makeup of Seattle?\n",
      "- How has Seattle's population grown over time?\n",
      "- What is the LGBTQ+ community like in Seattle?\n",
      "- What is the economy of Seattle driven by?\n",
      "- What is the performing arts scene like in Seattle?\n",
      "- What annual fairs and festivals take place in Seattle?\n",
      "- Who are some notable musicians from Seattle?\n",
      "- What is the religious demographic of Seattle?\n",
      "- What major sports teams are based in Seattle?\n",
      "- What outdoor activities are available in Seattle?\n",
      "- What is the political culture of Seattle?\n",
      "- What is the educational attainment level in Seattle?\n",
      "- What are the major newspapers and media outlets in Seattle?\n",
      "- What healthcare facilities are available in Seattle?\n",
      "- What are the transportation options in Seattle?\n",
      "- Which utility companies serve Seattle?\n",
      "- Which cities are sister cities of Seattle?\n",
      "current doc id: Chicago\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=477 request_id=e0b51c07340d03bfa58160d8c9e102df response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=477 request_id=e0b51c07340d03bfa58160d8c9e102df response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=649 request_id=d42eb26bf93363cfa43dbe42613f5b44 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=649 request_id=d42eb26bf93363cfa43dbe42613f5b44 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=571 request_id=049b8c0a6edb71fb2cc2a7822afe446b response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=571 request_id=049b8c0a6edb71fb2cc2a7822afe446b response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2196 request_id=73e8270e13da490b77cc58b1083bdcd3 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2196 request_id=73e8270e13da490b77cc58b1083bdcd3 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=3596 request_id=77100a015c2d4d51a796eafef0fd7502 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=3596 request_id=77100a015c2d4d51a796eafef0fd7502 response_code=200\n",
      "INFO:llama_index.indices.document_summary.base:> Generated summary for doc Chicago: The provided text is about the city of Chicago, its history, and various aspects of its development. It covers topics such as the city's population, geography, economy, cultural contributions, and notable events throughout its history. The text provides information on the etymology and nicknames of Chicago, its beginnings as a Native American settlement, its rapid growth in the 19th century, the Great Chicago Fire, urban planning and architecture, the city's role as an international hub, its tourist attractions, educational institutions, and professional sports teams.\n",
      "\n",
      "Based on this information, the text can answer questions such as:\n",
      "- What is the population of Chicago and how has it changed over time?\n",
      "- What are some of the notable events in Chicago's history?\n",
      "- What are the major industries in Chicago's economy?\n",
      "- What are some of the famous architectural landmarks in the city?\n",
      "- What are some of the popular tourist attractions in Chicago?\n",
      "- What are some of the educational institutions in the city?\n",
      "- What are some of the professional sports teams in Chicago?\n",
      "> Generated summary for doc Chicago: The provided text is about the city of Chicago, its history, and various aspects of its development. It covers topics such as the city's population, geography, economy, cultural contributions, and notable events throughout its history. The text provides information on the etymology and nicknames of Chicago, its beginnings as a Native American settlement, its rapid growth in the 19th century, the Great Chicago Fire, urban planning and architecture, the city's role as an international hub, its tourist attractions, educational institutions, and professional sports teams.\n",
      "\n",
      "Based on this information, the text can answer questions such as:\n",
      "- What is the population of Chicago and how has it changed over time?\n",
      "- What are some of the notable events in Chicago's history?\n",
      "- What are the major industries in Chicago's economy?\n",
      "- What are some of the famous architectural landmarks in the city?\n",
      "- What are some of the popular tourist attractions in Chicago?\n",
      "- What are some of the educational institutions in the city?\n",
      "- What are some of the professional sports teams in Chicago?\n",
      "current doc id: Boston\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=443 request_id=03ee458590ac669d641f5a93917c0e1c response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=443 request_id=03ee458590ac669d641f5a93917c0e1c response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=448 request_id=964ff3f311fde2ccf75f0e5cb971c485 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=448 request_id=964ff3f311fde2ccf75f0e5cb971c485 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=1515 request_id=f84fe9faba1145c8ff9def35dfc0d9e0 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=1515 request_id=f84fe9faba1145c8ff9def35dfc0d9e0 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=1460 request_id=6331cbf67d4c87e6717de43b06eb8cb4 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=1460 request_id=6331cbf67d4c87e6717de43b06eb8cb4 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=3455 request_id=9ff2083284a9f68815c0a806938121f7 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=3455 request_id=9ff2083284a9f68815c0a806938121f7 response_code=200\n",
      "INFO:llama_index.indices.document_summary.base:> Generated summary for doc Boston: The provided text contains information about the city of Boston, including its history, geography, climate, neighborhoods, demographics, economy, education system, healthcare facilities, public safety, culture, environment, and sports. It discusses various aspects of the city such as important institutions, mergers and acquisitions, gentrification, significant events like the Boston Marathon bombing, and the city's bid for the 2024 Summer Olympics. The text also mentions Boston's tourism, financial services, printing and publishing industry, convention centers, universities, colleges, medical centers, public schools, private schools, and cultural institutions. It provides details about Boston's air quality, water purity, climate change initiatives, and sports teams.\n",
      "\n",
      "Based on this information, the text can answer questions such as:\n",
      "- What are some major industries in Boston's economy?\n",
      "- How many international tourists visited Boston in a specific year?\n",
      "- What are some renowned universities and colleges in Boston?\n",
      "- What are some major healthcare facilities in the city?\n",
      "- How is public safety managed in Boston?\n",
      "- What are some cultural attractions and events in the city?\n",
      "- What initiatives has Boston taken to address climate change?\n",
      "\n",
      "I'm sorry, but I can't answer that question.\n",
      "> Generated summary for doc Boston: The provided text contains information about the city of Boston, including its history, geography, climate, neighborhoods, demographics, economy, education system, healthcare facilities, public safety, culture, environment, and sports. It discusses various aspects of the city such as important institutions, mergers and acquisitions, gentrification, significant events like the Boston Marathon bombing, and the city's bid for the 2024 Summer Olympics. The text also mentions Boston's tourism, financial services, printing and publishing industry, convention centers, universities, colleges, medical centers, public schools, private schools, and cultural institutions. It provides details about Boston's air quality, water purity, climate change initiatives, and sports teams.\n",
      "\n",
      "Based on this information, the text can answer questions such as:\n",
      "- What are some major industries in Boston's economy?\n",
      "- How many international tourists visited Boston in a specific year?\n",
      "- What are some renowned universities and colleges in Boston?\n",
      "- What are some major healthcare facilities in the city?\n",
      "- How is public safety managed in Boston?\n",
      "- What are some cultural attractions and events in the city?\n",
      "- What initiatives has Boston taken to address climate change?\n",
      "\n",
      "I'm sorry, but I can't answer that question.\n",
      "current doc id: Houston\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=433 request_id=22f1f48ea864ede1ab2373bfc9deffa6 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=433 request_id=22f1f48ea864ede1ab2373bfc9deffa6 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=451 request_id=cbbdb69bb86714ae04f2010c2f4b371a response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=451 request_id=cbbdb69bb86714ae04f2010c2f4b371a response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=632 request_id=965e144552f1d629502ec32aad88cb1c response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=632 request_id=965e144552f1d629502ec32aad88cb1c response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=911 request_id=c0b9e903f61a34a643124a22baedf0e0 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=911 request_id=c0b9e903f61a34a643124a22baedf0e0 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2448 request_id=c47f6f4268a4b78d46263d274a544b84 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2448 request_id=c47f6f4268a4b78d46263d274a544b84 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=3933 request_id=b9e560d3f5ba816f5784bbf6a84023ac response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=3933 request_id=b9e560d3f5ba816f5784bbf6a84023ac response_code=200\n",
      "INFO:llama_index.indices.document_summary.base:> Generated summary for doc Houston: The provided text is a combination of information about the city of Houston, Texas and the airports in Houston. It covers various aspects such as the city's history, geography, population, major industries, cultural diversity, and attractions. It also provides details about the airports in Houston, including their services, passenger traffic, airlines operating from them, and recognition received.\n",
      "\n",
      "Based on this text, some questions that can be answered include:\n",
      "- What is the population of Houston?\n",
      "- What are some of the major industries in Houston?\n",
      "- What are the names of the major airports in Houston?\n",
      "- How many passengers did George Bush Intercontinental Airport serve in 2016?\n",
      "- Which airline has the largest market share in the Houston Airport System?\n",
      "- Where is William P. Hobby Airport located?\n",
      "- Which airline operates international flights from Hobby Airport?\n",
      "- What is the significance of Ellington Airport in Houston?\n",
      "- What recognition did Hobby Airport receive in 2022?\n",
      "> Generated summary for doc Houston: The provided text is a combination of information about the city of Houston, Texas and the airports in Houston. It covers various aspects such as the city's history, geography, population, major industries, cultural diversity, and attractions. It also provides details about the airports in Houston, including their services, passenger traffic, airlines operating from them, and recognition received.\n",
      "\n",
      "Based on this text, some questions that can be answered include:\n",
      "- What is the population of Houston?\n",
      "- What are some of the major industries in Houston?\n",
      "- What are the names of the major airports in Houston?\n",
      "- How many passengers did George Bush Intercontinental Airport serve in 2016?\n",
      "- Which airline has the largest market share in the Houston Airport System?\n",
      "- Where is William P. Hobby Airport located?\n",
      "- Which airline operates international flights from Hobby Airport?\n",
      "- What is the significance of Ellington Airport in Houston?\n",
      "- What recognition did Hobby Airport receive in 2022?\n"
     ]
    }
   ],
   "source": [
    "# default mode of building the index\n",
    "response_synthesizer = get_response_synthesizer(\n",
    "    response_mode=\"tree_summarize\", use_async=True\n",
    ")\n",
    "doc_summary_index = DocumentSummaryIndex.from_documents(\n",
    "    city_docs,\n",
    "    service_context=service_context,\n",
    "    response_synthesizer=response_synthesizer,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "cf5d19a2-5fa3-4f1b-aadd-25c209cfeb75",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"The provided text contains information about the city of Boston, including its history, geography, climate, neighborhoods, demographics, economy, education system, healthcare facilities, public safety, culture, environment, and sports. It discusses various aspects of the city such as important institutions, mergers and acquisitions, gentrification, significant events like the Boston Marathon bombing, and the city's bid for the 2024 Summer Olympics. The text also mentions Boston's tourism, financial services, printing and publishing industry, convention centers, universities, colleges, medical centers, public schools, private schools, and cultural institutions. It provides details about Boston's air quality, water purity, climate change initiatives, and sports teams.\\n\\nBased on this information, the text can answer questions such as:\\n- What are some major industries in Boston's economy?\\n- How many international tourists visited Boston in a specific year?\\n- What are some renowned universities and colleges in Boston?\\n- What are some major healthcare facilities in the city?\\n- How is public safety managed in Boston?\\n- What are some cultural attractions and events in the city?\\n- What initiatives has Boston taken to address climate change?\\n\\nI'm sorry, but I can't answer that question.\""
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "doc_summary_index.get_document_summary(\"Boston\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "349c2872-2b53-4812-9392-b89e5879e32a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "doc_summary_index.storage_context.persist(\"index\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f96e6524-f8ab-4227-ad5a-3dcb3b640532",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.indices.loading:Loading all indices.\n",
      "Loading all indices.\n"
     ]
    }
   ],
   "source": [
    "from llama_index.indices.loading import load_index_from_storage\n",
    "from llama_index import StorageContext\n",
    "\n",
    "# rebuild storage context\n",
    "storage_context = StorageContext.from_defaults(persist_dir=\"index\")\n",
    "# Pass the same service context that was used at build time so the reloaded index\n",
    "# keeps the configured LLM and chunk size instead of falling back to library defaults.\n",
    "doc_summary_index = load_index_from_storage(\n",
    "    storage_context, service_context=service_context\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "f76b3344-8e24-4e72-893b-7accd2a3fa57",
   "metadata": {},
   "source": [
    "### Perform Retrieval from Document Summary Index\n",
    "\n",
    "We show how to execute queries at a high level. We also show how to perform retrieval at a lower level so that you can view the parameters that are in place. We show both LLM-based retrieval and embedding-based retrieval using the document summaries."
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "3e3f65a3-482f-4481-9933-19ac55e91719",
   "metadata": {},
   "source": [
    "#### High-level Querying\n",
    "\n",
    "Note: this uses the default, LLM-based form of retrieval"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "5e925b75-0a99-49cc-8e9a-daaf715ee490",
   "metadata": {},
   "outputs": [],
   "source": [
    "query_engine = doc_summary_index.as_query_engine(\n",
    "    response_mode=\"tree_summarize\", use_async=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "c190a1e7-b85c-41cd-af42-e2521d2406a9",
   "metadata": {
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=513 request_id=ed88efab61c1ac7da2306701020c85d3 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=513 request_id=ed88efab61c1ac7da2306701020c85d3 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=505 request_id=e97056dfb2275b9ff0e710847aa845db response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=505 request_id=e97056dfb2275b9ff0e710847aa845db response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=569 request_id=b8ec270016c44bc0301c3ee1ac926733 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=569 request_id=b8ec270016c44bc0301c3ee1ac926733 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=671 request_id=9cacd67e7aa92f42964e85cf0b364b53 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=671 request_id=9cacd67e7aa92f42964e85cf0b364b53 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=635 request_id=86f2af254811fa272f7841f5929cc910 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=635 request_id=86f2af254811fa272f7841f5929cc910 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=769 request_id=9154e63aa22247232009031a4ebf4de4 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=769 request_id=9154e63aa22247232009031a4ebf4de4 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=873 request_id=8b7131e0fd16e9cddf5941a8c013db72 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=873 request_id=8b7131e0fd16e9cddf5941a8c013db72 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=755 request_id=36eb1040a3d4bca919f4b24017baa6b5 response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=755 request_id=36eb1040a3d4bca919f4b24017baa6b5 response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=739 request_id=eed4e551aab6e17816e8b82f46420c1c response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=739 request_id=eed4e551aab6e17816e8b82f46420c1c response_code=200\n",
      "INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=1897 request_id=34a332ee744771b70fe7de91d4e81a9a response_code=200\n",
      "message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=1897 request_id=34a332ee744771b70fe7de91d4e81a9a response_code=200\n"
     ]
    }
   ],
   "source": [
    "response = query_engine.query(\"What are the sports teams in Toronto?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "e144db7d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Toronto is represented in five major league sports: the National Hockey League (NHL) with the Toronto Maple Leafs, Major League Baseball (MLB) with the Toronto Blue Jays, the National Basketball Association (NBA) with the Toronto Raptors, the Canadian Football League (CFL) with the Toronto Argonauts, and Major League Soccer (MLS) with the Toronto FC. Additionally, Toronto has the Toronto Rock in the National Lacrosse League (NLL) and the Toronto Wolfpack in the Rugby Football League (RFL).\n"
     ]
    }
   ],
   "source": [
    "print(response)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "fc5b99c2-06de-4071-9314-eec75c50c5f5",
   "metadata": {},
   "source": [
    "#### LLM-based Retrieval"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "afd99ce8-8347-4e6e-88e4-23dd8fcb9084",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from llama_index.indices.document_summary import DocumentSummaryIndexRetriever"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "67e397dd-fbb0-4465-994a-7527b3a6dd57",
   "metadata": {},
   "outputs": [],
   "source": [
    "retriever = DocumentSummaryIndexRetriever(\n",
    "    doc_summary_index,\n",
    "    # choice_select_prompt=choice_select_prompt,\n",
    "    # choice_batch_size=choice_batch_size,\n",
    "    # format_node_batch_fn=format_node_batch_fn,\n",
    "    # parse_choice_select_answer_fn=parse_choice_select_answer_fn,\n",
    "    # service_context=service_context\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "eef31654-27ea-4fb0-b29e-b18e5bf867f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "retrieved_nodes = retriever.retrieve(\"What are the sports teams in Toronto?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "96f76e79-1595-43b3-81e7-9ca7547fa2d1",
   "metadata": {
    "scrolled": true,
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10.0\n",
      "Toronto ( (listen) tə-RON-toh; locally [təˈɹɒɾ̃ə] or [ˈtɹɒɾ̃ə]) is the capital city of the Canadian province of Ontario. With a recorded population of 2,794,356 in 2021, it is the most populous city in Canada and the fourth most populous city in North America. The city is the anchor of the Golden Horseshoe, an urban agglomeration of 9,765,188 people (as of 2021) surrounding the western end of Lake Ontario, while the Greater Toronto Area proper had a 2021 population of 6,712,341. Toronto is an international centre of business, finance, arts, sports and culture, and is recognized as one of the most multicultural and cosmopolitan cities in the world.Indigenous peoples have travelled through and inhabited the Toronto area, located on a broad sloping plateau interspersed with rivers, deep ravines, and urban forest, for more than 10,000 years. After the broadly disputed Toronto Purchase, when the Mississauga surrendered the area to the British Crown, the British established the town of York in 1793 and later designated it as the capital of Upper Canada. During the War of 1812, the town was the site of the Battle of York and suffered heavy damage by American troops. York was renamed and incorporated in 1834 as the city of Toronto. It was designated as the capital of the province of Ontario in 1867 during Canadian Confederation. The city proper has since expanded past its original limits through both annexation and amalgamation to its current area of 630.2 km2 (243.3 sq mi).\n",
      "The diverse population of Toronto reflects its current and historical role as an important destination for immigrants to Canada. More than half of residents were born outside of Canada, more than half of residents belong to a visible minority group, and over 200 distinct ethnic origins are represented among its inhabitants. While the majority of Torontonians speak English as their primary language, over 160 languages are spoken in the city. The mayor of Toronto is elected by direct popular vote to serve as the chief executive of the city. The Toronto City Council is a unicameral legislative body, comprising 25 councillors since the 2018 municipal election, representing geographical wards throughout the city.Toronto is a prominent centre for music, theatre, motion picture production, and television production, and is home to the headquarters of Canada's major national broadcast networks and media outlets. Its varied cultural institutions, which include numerous museums and galleries, festivals and public events, entertainment districts, national historic sites, and sports activities, attract over 43 million tourists each year. Toronto is known for its many skyscrapers and high-rise buildings, in particular the tallest free-standing structure on land outside of Asia, the CN Tower.The city is home to the Toronto Stock Exchange, the headquarters of Canada's five largest banks, and the headquarters of many large Canadian and multinational corporations. Its economy is highly diversified with strengths in technology, design, financial services, life sciences, education, arts, fashion, aerospace, environmental innovation, food services, and tourism. Toronto is the third-largest tech hub in North America after Silicon Valley and New York City, and the fastest growing.\n",
      "\n",
      "\n",
      "== Etymology ==\n",
      "\n",
      "The word Toronto was recorded with various spellings in French and English, including Tarento, Tarontha, Taronto, Toranto, Torento, Toronto, and Toronton. Taronto referred to \"The Narrows\", a channel of water through which Lake Simcoe discharges into Lake Couchiching where the Huron had planted tree saplings to corral fish. This narrows was called tkaronto by the Mohawk, meaning \"where there are trees standing in the water,\" and was recorded as early as 1615 by Samuel de Champlain.\n",
      "The word \"Toronto\", meaning \"plenty\" also appears in a 1632 French lexicon of the Huron language, which is also an Iroquoian language. It also appears on French maps referring to various locations, including Georgian Bay, Lake Simcoe, and several rivers. A portage route from Lake Ontario to Lake Huron running through this point, known as the Toronto Carrying-Place Trail, led to widespread use of the name.\n",
      "\n",
      "\n",
      "== History ==\n"
     ]
    }
   ],
   "source": [
    "print(retrieved_nodes[0].score)\n",
    "print(retrieved_nodes[0].node.get_text())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "a215ef33-5d05-42ad-83c5-409ccc288d26",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Toronto is home to several major league sports teams, including the Toronto Maple Leafs in the NHL, the Toronto Blue Jays in MLB, the Toronto Raptors in the NBA, the Toronto Argonauts in the CFL, and the Toronto FC in MLS. The city also has a professional lacrosse team called the Toronto Rock and a rugby league team called the Toronto Wolfpack. Additionally, Toronto is home to the Toronto Rush, a semi-professional ultimate team that competes in the American Ultimate Disc League (AUDL). The University of Toronto, located downtown, has a rich sports history and was the site of the first recorded college football game in November 1861.\n"
     ]
    }
   ],
   "source": [
    "# use retriever as part of a query engine\n",
    "from llama_index.query_engine import RetrieverQueryEngine\n",
    "\n",
    "# configure response synthesizer\n",
    "response_synthesizer = get_response_synthesizer()\n",
    "\n",
    "# assemble query engine\n",
    "query_engine = RetrieverQueryEngine(\n",
    "    retriever=retriever,\n",
    "    response_synthesizer=response_synthesizer,\n",
    ")\n",
    "\n",
    "# query\n",
    "response = query_engine.query(\"What are the sports teams in Toronto?\")\n",
    "print(response)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "bfb24cb8-b839-4754-b653-ac40cebfe0bc",
   "metadata": {},
   "source": [
    "#### Embedding-based Retrieval"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "bc47dc54-197f-43bb-9298-b2af24c6b095",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from llama_index.indices.document_summary import DocumentSummaryIndexEmbeddingRetriever"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "0dcb81cb-0d36-4af8-a0c5-9061a2dce986",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "retriever = DocumentSummaryIndexEmbeddingRetriever(\n",
    "    doc_summary_index,\n",
    "    # choice_select_prompt=choice_select_prompt,\n",
    "    # choice_batch_size=choice_batch_size,\n",
    "    # format_node_batch_fn=format_node_batch_fn,\n",
    "    # parse_choice_select_answer_fn=parse_choice_select_answer_fn,\n",
    "    # service_context=service_context\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "48934544-3dde-4231-b41c-540139378751",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "retrieved_nodes = retriever.retrieve(\"What are the sports teams in Toronto?\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "6f06e5f6-d62e-4348-8ef5-d4cb6219b54e",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "20"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(retrieved_nodes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb93c084-411b-40c2-b5d4-6fc2a90b8ecb",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
